Stats 506 PS5

Author

Alyssa Yang

Problem 1: OOP Programming

1a

# Make new rational class
setClass("rational",
    slots = list(
      a = "numeric",
      b = "numeric"
    ),
    validity = function(object) { # Validator
      if (is.null(object@a)) {
        stop("Rational must have a numerator")
      }
      if (object@b == 0) {
        return("Denominator cannot be zero.")
      }
      return(TRUE)
    }
)
# Constructor
rational <- function(a, b = 1) {
  new("rational", a = a, b = b)
}
# Show method
setMethod("show", "rational",
  function(object) {
    if (object@b == 1) {
      print(object@a)
    }
    else if (object@a == 0) {
      print(0)
    }
    else {
      cat(paste0(object@a, "/", object@b, "\n"))
    }
    return(invisible(object))
  }
)
# GCD and LCM in RCpp
library(Rcpp)

cppFunction("
  #include <numeric>`
  int C_gcd(int a, int b) {
    return std::gcd(a, b);
  }")

cppFunction("
  #include <numeric>
  int C_lcm(int a, int b) {
    return std::lcm(a, b);
  }")
# Simplify method
setGeneric("simplify", 
  function(object, ...) {
    standardGeneric("simplify")
  })
[1] "simplify"
setMethod("simplify", "rational",
  function(object, print_result = TRUE) {
    gcd <- C_gcd(object@a, object@b)
    object@a <- object@a / gcd
    object@b <- object@b / gcd
    if (print_result) {
      show(object)
    }
    return(invisible(object))
  })
# Quotient method
setGeneric("quotient",
  function(object, digits = 4) {
    standardGeneric("quotient")
  })
[1] "quotient"
setMethod("quotient", "rational",
  function(object, digits = 4) {
    if (!is.numeric(digits) || digits != as.integer(digits) || digits < 0) {
      stop("Digits must be a non-negative integer.")
    }
    
    result <- object@a / object@b
    print(sprintf(paste0("%.", digits, "f"), result))
    return(invisible(result))
  })
# +, -, *, /
setMethod("+", signature(e1 = "rational",
                         e2 = "rational"),
          function(e1, e2) {
            lcm <- C_lcm(e1@b, e2@b)
            num1 <- (lcm / e1@b) * e1@a
            num2 <- (lcm / e2@b) * e2@a
            return(simplify(rational(a = num1 + num2, b = lcm), FALSE))
          })

setMethod("-", signature(e1 = "rational",
                         e2 = "rational"),
          function(e1, e2) {
            lcm <- C_lcm(e1@b, e2@b)
            num1 <- (lcm / e1@b) * e1@a
            num2 <- (lcm / e2@b) * e2@a
            return(simplify(rational(a = num1 - num2, b = lcm), FALSE))
          })

setMethod("*", signature(e1 = "rational",
                         e2 = "rational"),
          function(e1, e2) {
            return(simplify(rational(a = e1@a * e2@a, b = e1@b * e2@b), FALSE))
          })

setMethod("/", signature(e1 = "rational",
                         e2 = "rational"),
          function(e1, e2) {
            return(simplify(rational(a = e1@a * e2@b, b = e1@b * e2@a), FALSE))
          })

1b

r1 <- rational(a = 24, b = 6)
r2 <- rational(a = 7, b = 230)
r3 <- rational(a = 0, b = 4)
r1
24/6
r3
[1] 0
r1 + r2
927/230
r1 - r2
913/230
r1 * r2
14/115
r1 / r2
920/7
r1 + r3
[1] 4
r1 * r3
[1] 0
r2 / r3
Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'simplify': invalid class "rational" object: Denominator cannot be zero.
quotient(r1)
[1] "4.0000"
quotient(r2)
[1] "0.0304"
quotient(r2, digits = 3)
[1] "0.030"
quotient(r2, digits = 3.14)
Error in quotient(r2, digits = 3.14): Digits must be a non-negative integer.
quotient(r2, digits = "avocado")
Error in quotient(r2, digits = "avocado"): Digits must be a non-negative integer.
q2 <- quotient(r2, digits = 3)
[1] "0.030"
q2
[1] 0.03043478
quotient(r3)
[1] "0.0000"
simplify(r1)
[1] 4
simplify(r2)
7/230
simplify(r3)
[1] 0

1c

# Check for no creation of 0's in denominator
t1 <- rational(a = 2, b = 0)
Error in validObject(.Object): invalid class "rational" object: Denominator cannot be zero.
t2 <- rational(a = 0, b = 0)
Error in validObject(.Object): invalid class "rational" object: Denominator cannot be zero.
t3 <- rational(a = 1, b = 2)
t4 <- rational(a = 0, b = 4)
t5 <- t3 / t4
Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'simplify': invalid class "rational" object: Denominator cannot be zero.
# Check for other malformed inputs
t6 <- rational(a = "a", b = "b")
Error in validObject(.Object): invalid class "rational" object: 1: invalid object for slot "a" in class "rational": got class "character", should be or extend class "numeric"
invalid class "rational" object: 2: invalid object for slot "b" in class "rational": got class "character", should be or extend class "numeric"
t7 <- rational(a = "3", b = "4")
Error in validObject(.Object): invalid class "rational" object: 1: invalid object for slot "a" in class "rational": got class "character", should be or extend class "numeric"
invalid class "rational" object: 2: invalid object for slot "b" in class "rational": got class "character", should be or extend class "numeric"
t9 <- rational(a = 2)
t10 <- rational(b = 2)
Error in rational(b = 2): argument "a" is missing, with no default
t11 <- rational()
Error in rational(): argument "a" is missing, with no default

Problem 2: plotly

2a

Code
suppressMessages(library(dplyr))
library(tidyr)
library(stringr)
library(ggplot2)
suppressMessages(library(plotly))

art_sales <- read.csv("df_for_ml_improved_new_market.csv")
Code
# Collapse genre columns into one column
art_sales_genres <- art_sales %>%
  pivot_longer(cols = starts_with("Genre___"),
               names_to = "genre",
               values_to = "count")

# Rename genre values and select relevant columns
art_sales_genres <- art_sales_genres %>%
  mutate(genre = str_remove(genre, "Genre___")) %>%
  filter(count > 0) %>%
  select(year, genre)

# Calculate proportions of genre of sales across years
genre_dist <- art_sales_genres %>%
  group_by(year, genre) %>%
  summarize(num_sales = n()) %>%
  mutate(proportion = num_sales / sum(num_sales))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
Code
# Plot genre distribution across years
ggplot(genre_dist, aes(x = year, y = proportion, fill = genre)) +
  geom_area(alpha = 0.75, size = 0.5, color = "white") +
  labs(title = "Distribution of Genre of Sales Across Years",
       x = "Year",
       y = "Proportion of Sales")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

2b

Code
# Collapse genre columns into one column
art_sales_genres <- art_sales %>%
  pivot_longer(cols = starts_with("Genre___"),
               names_to = "genre",
               values_to = "count")

# Rename genre values and select relevant columns
art_sales_genres <- art_sales_genres %>%
  mutate(genre = str_remove(genre, "Genre___")) %>%
  filter(count > 0) %>%
  select(year, genre, price_usd)

# Calculate average sales price by year and genre
avg_price_genre <- art_sales_genres %>%
  group_by(year, genre) %>%
  summarize(avg_sales_price = mean(price_usd, na.rm = TRUE))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
Code
# Filter genre dataframe for each genre to plot
others <- avg_price_genre %>%
  filter(genre == "Others")

painting <- avg_price_genre %>%
  filter(genre == "Painting")

photography <- avg_price_genre %>%
  filter(genre == "Photography")

print <- avg_price_genre %>%
  filter(genre == "Print")

sculpture <- avg_price_genre %>%
  filter(genre == "Sculpture")
Code
library(plotly)

# Create the plot with both sales price and genre average lines
p <- plot_ly() |>
  add_markers(data = art_sales,
              x = ~year, y = ~price_usd, 
              name = "Sales Price", 
              marker = list(opacity = 0.5, color = "gray")) |>
  add_lines(data = art_sales,
            x = ~year, y = ~fitted(loess(price_usd ~ year, data = art_sales)), 
            name = "Trend line",
            line = list(color = "red")) |>
  add_markers(data = others,
            x = ~year, y = ~avg_sales_price,
            name = "Others", mode = "lines",
            line = list(color = "orange"), marker = list(opacity = 0)) |>
  add_markers(data = painting,
            x = ~year, y = ~avg_sales_price,
            name = "Painting", mode = "lines",
            line = list(color = "yellow"), marker = list(opacity = 0)) |>
  add_markers(data = photography,
            x = ~year, y = ~avg_sales_price,
            name = "Photography", mode = "lines",
            line = list(color = "green"), marker = list(opacity = 0)) |>
  add_markers(data = print,
            x = ~year, y = ~avg_sales_price,
            name = "Print", mode = "lines",
            line = list(color = "blue"), marker = list(opacity = 0)) |>
  add_markers(data = sculpture,
            x = ~year, y = ~avg_sales_price,
            name = "Sculpture", mode = "lines",
            line = list(color = "purple"), marker = list(opacity = 0)) |>
  layout(
    title = "Change in Sales Price in USD Over Time",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Sales Price in USD", range = c(0, 100000)),
    showlegend = TRUE
  )
Code
# Create menus to switch between overall trend and genres
p |> layout(updatemenus = list(
  list(
    y = 0.95,
    buttons = list(
      list(method = "update",
           args = list(list(visible = list(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE))),
           label = "All lines"),
      list(method = "update",
           args = list(list(visible = list(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE))),
           label = "Overall trend"),
      list(method = "update",
           args = list(list(visible = list(TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE))),
           label = "All genres"),
      list(method = "update",
           args = list(list(visible = list(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE))),
           label = "Others"),
      list(method = "update",
           args = list(list(visible = list(TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE))),
           label = "Painting"),
      list(method = "update",
           args = list(list(visible = list(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE))),
           label = "Photography"),
      list(method = "update",
           args = list(list(visible = list(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE))),
           label = "Print"),
      list(method = "update",
           args = list(list(visible = list(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE))),
           label = "Sculpture")
    )
  )
))
A line object has been specified, but lines is not in the mode
Adding lines to the mode...
A line object has been specified, but lines is not in the mode
Adding lines to the mode...
A line object has been specified, but lines is not in the mode
Adding lines to the mode...
A line object has been specified, but lines is not in the mode
Adding lines to the mode...
A line object has been specified, but lines is not in the mode
Adding lines to the mode...

Problem 3:

3a

suppressMessages(library(data.table))
library(nycflights13)
flights <- data.table(flights)
airports <- data.table(airports)
planes <- data.table(planes)
# Departure delays
flights_dep <- merge(flights[, faa := origin],
                     airports,
                     by = "faa",
                     all.x = TRUE)

flights_dep[, .(mean_delay = mean(dep_delay, na.rm = TRUE),
                median_delay = median(dep_delay, na.rm = TRUE),
                num_flights = .N),
            by = name] |>
  _[num_flights >= 10] |>
  _[,!"num_flights"] |>
  _[order(mean_delay, decreasing = TRUE)]
                  name mean_delay median_delay
                <char>      <num>        <num>
1: Newark Liberty Intl   15.10795           -1
2: John F Kennedy Intl   12.11216           -1
3:          La Guardia   10.34688           -3
# Arrival delays
flights_arr <- merge(flights[, faa := dest],
                     airports,
                     by = "faa",
                     all.x = TRUE)

flights_arr[, .(name = ifelse(is.na(name[1]), faa[1], name[1]),
                mean_delay = mean(arr_delay, na.rm = TRUE),
                med_delay = median(arr_delay, na.rm = TRUE),
                num_flights = .N),
            by = faa] |>
  _[num_flights >= 10] |>
  _[, !c("faa", "num_flights")] |>
  _[order(mean_delay, decreasing = TRUE)] |>
  print(x = _, nrows = 110)
                                     name   mean_delay med_delay
                                   <char>        <num>     <num>
  1:                Columbia Metropolitan  41.76415094      28.0
  2:                           Tulsa Intl  33.65986395      14.0
  3:                    Will Rogers World  30.61904762      16.0
  4:                 Jackson Hole Airport  28.09523810      15.0
  5:                        Mc Ghee Tyson  24.06920415       2.0
  6:               Dane Co Rgnl Truax Fld  20.19604317       1.0
  7:                        Richmond Intl  20.11125320       1.0
  8:        Akron Canton Regional Airport  19.69833729       3.0
  9:                      Des Moines Intl  19.00573614       0.0
 10:                   Gerald R Ford Intl  18.18956044       1.0
 11:                      Birmingham Intl  16.87732342      -2.0
 12:         Theodore Francis Green State  16.23463687       1.0
 13: Greenville-Spartanburg International  15.93544304      -0.5
 14:    Cincinnati Northern Kentucky Intl  15.36456376      -3.0
 15:            Savannah Hilton Head Intl  15.12950601      -1.0
 16:          Manchester Regional Airport  14.78755365      -3.0
 17:                          Eppley Afld  14.69889841      -2.0
 18:                               Yeager  14.67164179      -1.5
 19:                     Kansas City Intl  14.51405836       0.0
 20:                          Albany Intl  14.39712919      -4.0
 21:                General Mitchell Intl  14.16722038       0.0
 22:                       Piedmont Triad  14.11260054      -2.0
 23:               Washington Dulles Intl  13.86420212      -3.0
 24:               Cherry Capital Airport  12.96842105     -10.0
 25:              James M Cox Dayton Intl  12.68048606      -3.0
 26:     Louisville International Airport  12.66938406      -2.0
 27:                  Chicago Midway Intl  12.36422360      -1.0
 28:                      Sacramento Intl  12.10992908       4.0
 29:                    Jacksonville Intl  11.84483416      -2.0
 30:                       Nashville Intl  11.81245891      -2.0
 31:                Portland Intl Jetport  11.66040210      -4.0
 32:               Greater Rochester Intl  11.56064461      -5.0
 33:      Hartsfield Jackson Atlanta Intl  11.30011285      -1.0
 34:                Lambert St Louis Intl  11.07846451      -3.0
 35:                         Norfolk Intl  10.94909344      -4.0
 36:            Baltimore Washington Intl  10.72673385      -5.0
 37:                         Memphis Intl  10.64531435      -2.5
 38:                   Port Columbus Intl  10.60132291      -3.0
 39:                  Charleston Afb Intl  10.59296847      -4.0
 40:                    Philadelphia Intl  10.12719014      -3.0
 41:                  Raleigh Durham Intl  10.05238095      -3.0
 42:                    Indianapolis Intl   9.94043412      -3.0
 43:            Charlottesville-Albemarle   9.50000000      -5.0
 44:               Cleveland Hopkins Intl   9.18161129      -5.0
 45:        Ronald Reagan Washington Natl   9.06695204      -2.0
 46:                      Burlington Intl   8.95099602      -4.0
 47:                 Buffalo Niagara Intl   8.94595186      -5.0
 48:                Syracuse Hancock Intl   8.90392501      -5.0
 49:                          Denver Intl   8.60650021      -2.0
 50:                      Palm Beach Intl   8.56297210      -3.0
 51:                                  BQN   8.24549550      -1.0
 52:                             Bob Hope   8.17567568      -3.0
 53:       Fort Lauderdale Hollywood Intl   8.08212154      -3.0
 54:                          Bangor Intl   8.02793296      -9.0
 55:           Asheville Regional Airport   8.00383142      -1.0
 56:                                  PSE   7.87150838       0.0
 57:                      Pittsburgh Intl   7.68099053      -5.0
 58:                       Gallatin Field   7.60000000      -2.0
 59:                 NW Arkansas Regional   7.46572581      -2.0
 60:                           Tampa Intl   7.40852503      -4.0
 61:               Charlotte Douglas Intl   7.36031885      -3.0
 62:             Minneapolis St Paul Intl   7.27016886      -5.0
 63:                      William P Hobby   7.17618819      -4.0
 64:                         Bradley Intl   7.04854369     -10.0
 65:                     San Antonio Intl   6.94537178      -9.0
 66:                      South Bend Rgnl   6.50000000      -3.5
 67:     Louis Armstrong New Orleans Intl   6.49017497      -6.0
 68:                        Key West Intl   6.35294118       7.0
 69:                        Eagle Co Rgnl   6.30434783      -4.0
 70:                Austin Bergstrom Intl   6.01990875      -5.0
 71:                   Chicago Ohare Intl   5.87661475      -8.0
 72:                         Orlando Intl   5.45464309      -5.0
 73:               Detroit Metro Wayne Co   5.42996346      -7.0
 74:                        Portland Intl   5.14157973      -5.0
 75:                        Nantucket Mem   4.85227273      -3.0
 76:                      Wilmington Intl   4.63551402      -7.0
 77:                    Myrtle Beach Intl   4.60344828     -13.0
 78:    Albuquerque International Sunport   4.38188976      -5.5
 79:         George Bush Intercontinental   4.24079040      -5.0
 80:        Norman Y Mineta San Jose Intl   3.44817073      -7.0
 81:               Southwest Florida Intl   3.23814963      -5.0
 82:                       San Diego Intl   3.13916574      -5.0
 83:              Sarasota Bradenton Intl   3.08243131      -5.0
 84:            Metropolitan Oakland Intl   3.07766990      -9.0
 85:   General Edward Lawrence Logan Intl   2.91439222      -9.0
 86:                   San Francisco Intl   2.67289152      -8.0
 87:                                  SJU   2.52052659      -6.0
 88:                         Yampa Valley   2.14285714       2.0
 89:              Phoenix Sky Harbor Intl   2.09704733      -6.0
 90:            Montrose Regional Airport   1.78571429     -10.5
 91:                     Los Angeles Intl   0.54711094      -7.0
 92:               Dallas Fort Worth Intl   0.32212685      -9.0
 93:                           Miami Intl   0.29905978      -9.0
 94:                       Mc Carran Intl   0.25772849      -8.0
 95:                  Salt Lake City Intl   0.17625459      -8.0
 96:                           Long Beach  -0.06202723     -10.0
 97:                Martha\\\\'s Vineyard  -0.28571429     -11.0
 98:                  Seattle Tacoma Intl  -1.09909910     -11.0
 99:                        Honolulu Intl  -1.36519258      -7.0
100:                                  STT  -3.83590734      -9.0
101:            John Wayne Arpt Orange Co  -7.86822660     -11.0
102:                    Palm Springs Intl -12.72222222     -13.5
                                     name   mean_delay med_delay

3b

flights_planes <- merge(flights,
                        planes,
                        by = "tailnum",
                        all.x = TRUE)

flights_planes[, .(avg_mph = mean(distance/(air_time/60)),
                   num_flights = .N),
               by = model] |>
  _[order(avg_mph, decreasing = TRUE)] |>
  _[head(1)]
     model  avg_mph num_flights
    <char>    <num>       <int>
1: 777-222 482.6254           4